{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 金融 证券 图谱搭建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-28T04:42:41.694335Z",
     "start_time": "2020-01-28T04:42:37.616242Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import hashlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_md5(string):\n",
    "    \"\"\"\n",
    "    Get md5 according to the string\n",
    "    \"\"\"\n",
    "    byte_string = string.encode(\"utf-8\")\n",
    "    md5 = hashlib.md5()\n",
    "    md5.update(byte_string)\n",
    "    result = md5.hexdigest()\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_update_data(filename,path_origin,path_new,labels,keys,names,types = None):\n",
    "    \"\"\"\n",
    "    筛选需要的节点和关系数据\n",
    "    \"\"\"\n",
    "    data_origin = pd.read_csv(path_origin + filename,header=None,names=names,dtype=str)\n",
    "    data_new = pd.read_csv(path_new + filename,header=None,names=names,dtype=str)\n",
    "    \n",
    "    \n",
    "    # 拼接数据\n",
    "    if types == 'rel_manager':\n",
    "        # 主键 hash 提高拼接速度\n",
    "        data_origin['hash_key'] = data_origin.apply(lambda x:get_md5('{}-{}-{}-{}'.format(x['fullname'],x['hash_cust'], x['title'],x['begin_date'])),axis=1)\n",
    "        data_new['hash_key'] = data_new.apply(lambda x:get_md5('{}-{}-{}-{}'.format(x['fullname'], x['hash_cust'],x['title'],x['begin_date'])),axis=1)\n",
    "        data = pd.merge(data_new,data_origin,on = ['hash_key'],how = 'left')\n",
    "        data = data.rename(columns={'fullname_x':'fullname','hash_cust_x':'hash_cust','title_x':'title','begin_date_x':'begin_date'})\n",
    "    else:\n",
    "        data = pd.merge(data_new,data_origin,on = keys,how = 'left')\n",
    "    \n",
    "    names1 = list(set(names) - set(keys))\n",
    "    \n",
    "    for c in keys:\n",
    "        data = data[data[c].notnull()]\n",
    "\n",
    "    for c in names:\n",
    "        if ( c not in keys)&(c != labels):\n",
    "            # print(c)\n",
    "            data[c] = data.apply(lambda x: x[c+'_x'] if x[c+'_x']!=x[c+'_y'] else None ,axis = 1)\n",
    "    data[labels] = data[labels + '_x']\n",
    "    data['tags'] =  data.apply(lambda x: sum([1 for t in names1 if pd.notnull(x[t])]) ,axis = 1) \n",
    "\n",
    "    te = data.loc[data['tags'] != 1 ,names].drop_duplicates().fillna(\"NotChange\")\n",
    "    \n",
    "    # 保存数据\n",
    "    te.to_csv(out_path+filename, index =False)\n",
    "    print(te.shape)\n",
    "    print('-'*20)\n",
    "    print(data.tags.value_counts())\n",
    "    return te"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_path = '../data_update/gap/'\n",
    "\n",
    "path_origin = '../data/output/'\n",
    "\n",
    "path_new = '../data_update/output/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: cannot create directory ‘../data_update/gap/’: File exists\r\n"
     ]
    }
   ],
   "source": [
    "!mkdir ../data_update/gap/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-26T14:58:19.217230Z",
     "start_time": "2020-01-26T14:58:19.208235Z"
    }
   },
   "source": [
    "### 公司\n",
    "合并 处理 基金管理人\\ 基金托管人\\ 上市公司\\"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'node_companies.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'label'\n",
    "keys = ['fullname']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['fullname', 'name', 'symbol', 'market', 'exchange', 'list_status', 'list_date',\n",
    "         'delist_date', 'setup_date', 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(49, 10)\n",
      "--------------------\n",
      "1    18955\n",
      "8       37\n",
      "2        7\n",
      "3        5\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 节点：行业"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'node_industries.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'label'\n",
    "keys = ['industry']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['industry', 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 2)\n",
      "--------------------\n",
      "1    110\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 关系：股票-->行业"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'rel_share_in_industry.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'type'\n",
    "keys = ['fullname', 'industry']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['fullname', 'industry', 'type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 3)\n",
      "--------------------\n",
      "1    3766\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 节点：省份"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'node_province.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'label'\n",
    "keys = ['province']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['province', 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 2)\n",
      "--------------------\n",
      "1    32\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 节点：城市"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'node_city.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'label'\n",
    "keys = ['city']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['city', 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 2)\n",
      "--------------------\n",
      "1    341\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 关系：公司-->城市"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'rel_company_in_city.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'type'\n",
    "keys = ['fullname', 'city']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['fullname', 'city', 'type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 3)\n",
      "--------------------\n",
      "1    18964\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 关系：城市-->省份"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'rel_city_in_province.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "keys = ['city', 'province']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'type'\n",
    "names = ['city', 'province', 'type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 3)\n",
      "--------------------\n",
      "1    341\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 节点：人（上市公司董事高管）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'node_managers.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'label'\n",
    "keys = ['hash_cust']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['hash_cust', 'name', 'gender', 'edu', 'national', 'birthday', 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 7)\n",
      "--------------------\n",
      "1    163613\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 关系：公司（上市公司）--> 人(董事高管)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对于 公司与 高管之间的关系，不同与上述其他节点之间关系，会存在多边的情况存在。因为一个管理者在一个任期中，会存在以下两种情况\n",
    "- 任期结束--> 更新结束时间\n",
    "- 职位调整 --> 更新结束时间 新建一条边。\n",
    "\n",
    "针对，复杂性对于这部分情况的数据更新需要单独处理。\n",
    "\n",
    "即将通过公司（company）、人（manger_id）、职位(title)、上任时间（start_date）四个变量去定位一条边。\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![](pictures/4b9ce4c5-c9ca-4e10-ba98-6deced93d145.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'rel_listed_company_has_manager.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'lev'\n",
    "keys = ['fullname','hash_cust','title','begin_date']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['fullname', 'hash_cust', 'begin_date', 'end_date', 'title', 'lev']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1251, 6)\n",
      "--------------------\n",
      "1    429847\n",
      "2      1251\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names, types='rel_manager')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fullname</th>\n",
       "      <th>hash_cust</th>\n",
       "      <th>begin_date</th>\n",
       "      <th>end_date</th>\n",
       "      <th>title</th>\n",
       "      <th>lev</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5240</th>\n",
       "      <td>首航高科能源技术股份有限公司</td>\n",
       "      <td>f756ce1cd2959c31aefbf00613ce741b</td>\n",
       "      <td>20101214</td>\n",
       "      <td>20131107</td>\n",
       "      <td>薪酬与考核委员会委员</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5241</th>\n",
       "      <td>首航高科能源技术股份有限公司</td>\n",
       "      <td>f756ce1cd2959c31aefbf00613ce741b</td>\n",
       "      <td>20101214</td>\n",
       "      <td>20131107</td>\n",
       "      <td>薪酬与考核委员会主任</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5242</th>\n",
       "      <td>首航高科能源技术股份有限公司</td>\n",
       "      <td>f756ce1cd2959c31aefbf00613ce741b</td>\n",
       "      <td>20101214</td>\n",
       "      <td>20131107</td>\n",
       "      <td>独立董事</td>\n",
       "      <td>董事会成员</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5243</th>\n",
       "      <td>首航高科能源技术股份有限公司</td>\n",
       "      <td>f756ce1cd2959c31aefbf00613ce741b</td>\n",
       "      <td>20101214</td>\n",
       "      <td>20131107</td>\n",
       "      <td>提名委员会委员</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5244</th>\n",
       "      <td>首航高科能源技术股份有限公司</td>\n",
       "      <td>f756ce1cd2959c31aefbf00613ce741b</td>\n",
       "      <td>20101214</td>\n",
       "      <td>20131107</td>\n",
       "      <td>战略委员会委员</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            fullname                         hash_cust begin_date  end_date  \\\n",
       "5240  首航高科能源技术股份有限公司  f756ce1cd2959c31aefbf00613ce741b   20101214  20131107   \n",
       "5241  首航高科能源技术股份有限公司  f756ce1cd2959c31aefbf00613ce741b   20101214  20131107   \n",
       "5242  首航高科能源技术股份有限公司  f756ce1cd2959c31aefbf00613ce741b   20101214  20131107   \n",
       "5243  首航高科能源技术股份有限公司  f756ce1cd2959c31aefbf00613ce741b   20101214  20131107   \n",
       "5244  首航高科能源技术股份有限公司  f756ce1cd2959c31aefbf00613ce741b   20101214  20131107   \n",
       "\n",
       "           title    lev  \n",
       "5240  薪酬与考核委员会委员     其他  \n",
       "5241  薪酬与考核委员会主任     其他  \n",
       "5242        独立董事  董事会成员  \n",
       "5243     提名委员会委员     其他  \n",
       "5244     战略委员会委员     其他  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "te.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 节点：基金"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'node_funds.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'label'\n",
    "keys = ['fund']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['fund', 'name', 'fund_type', 'invest_type', 'type', 'benchmark', 'market', \n",
    "         'found_date', 'delist_date', 'status', 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(378, 11)\n",
      "--------------------\n",
      "1     10658\n",
      "10      186\n",
      "3       139\n",
      "2        37\n",
      "4        11\n",
      "6         5\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T05:41:29.472559Z",
     "start_time": "2020-01-20T05:41:25.448556Z"
    }
   },
   "source": [
    "### 关系: 基金--->托管人 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'rel_fund_has_custodian.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'type'\n",
    "keys = ['fund', 'fullname']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['fund', 'fullname', 'type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 3)\n",
      "--------------------\n",
      "1    11100\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 3)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "te.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T05:41:29.472559Z",
     "start_time": "2020-01-20T05:41:25.448556Z"
    }
   },
   "source": [
    "### 关系: 公募基金 --> 管理人"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'rel_fund_has_management.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'type'\n",
    "keys = ['fund', 'fullname']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['fund', 'fullname', 'type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 3)\n",
      "--------------------\n",
      "1    11112\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 3)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "te.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T05:41:29.472559Z",
     "start_time": "2020-01-20T05:41:25.448556Z"
    }
   },
   "source": [
    "### 关系: 公募基金持仓数据 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = 'rel_fund_listed_company_portfolio.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = 'type'\n",
    "keys = ['fund', 'fullname']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = ['fund', 'fullname', 'ann_date', 'end_date', 'mkv', 'amount', 'stk_mkv_ratio',\n",
    "         'stk_float_ratio', 'type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(54559, 9)\n",
      "--------------------\n",
      "1    1404484\n",
      "7      29049\n",
      "6      18456\n",
      "5       6777\n",
      "4        208\n",
      "2         69\n",
      "Name: tags, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "te = filter_update_data(filename,path_origin,path_new,labels,keys,names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(54559, 9)"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "te.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fund</th>\n",
       "      <th>fullname</th>\n",
       "      <th>ann_date</th>\n",
       "      <th>end_date</th>\n",
       "      <th>mkv</th>\n",
       "      <th>amount</th>\n",
       "      <th>stk_mkv_ratio</th>\n",
       "      <th>stk_float_ratio</th>\n",
       "      <th>type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1461735</th>\n",
       "      <td>000001</td>\n",
       "      <td>山东天鹅棉业机械股份有限公司</td>\n",
       "      <td>20160827</td>\n",
       "      <td>20160630</td>\n",
       "      <td>39006.62</td>\n",
       "      <td>874.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>IN_PORTFOLIO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1461930</th>\n",
       "      <td>000001</td>\n",
       "      <td>南极电商股份有限公司</td>\n",
       "      <td>20200121</td>\n",
       "      <td>20191231</td>\n",
       "      <td>149832015.87</td>\n",
       "      <td>NotChange</td>\n",
       "      <td>4.31</td>\n",
       "      <td>NotChange</td>\n",
       "      <td>IN_PORTFOLIO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1462074</th>\n",
       "      <td>000001</td>\n",
       "      <td>保利联合化工控股集团股份有限公司</td>\n",
       "      <td>20130328</td>\n",
       "      <td>20121231</td>\n",
       "      <td>8927779.35</td>\n",
       "      <td>363657.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.21</td>\n",
       "      <td>IN_PORTFOLIO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1462075</th>\n",
       "      <td>000001</td>\n",
       "      <td>保利发展控股集团股份有限公司</td>\n",
       "      <td>20200121</td>\n",
       "      <td>20191231</td>\n",
       "      <td>303983999.02</td>\n",
       "      <td>18787639.0</td>\n",
       "      <td>8.74</td>\n",
       "      <td>0.16</td>\n",
       "      <td>IN_PORTFOLIO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1462242</th>\n",
       "      <td>000001</td>\n",
       "      <td>上海钢联电子商务股份有限公司</td>\n",
       "      <td>20200121</td>\n",
       "      <td>20191231</td>\n",
       "      <td>193610279.2</td>\n",
       "      <td>2488564.0</td>\n",
       "      <td>5.57</td>\n",
       "      <td>1.63</td>\n",
       "      <td>IN_PORTFOLIO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           fund          fullname  ann_date  end_date           mkv  \\\n",
       "1461735  000001    山东天鹅棉业机械股份有限公司  20160827  20160630      39006.62   \n",
       "1461930  000001        南极电商股份有限公司  20200121  20191231  149832015.87   \n",
       "1462074  000001  保利联合化工控股集团股份有限公司  20130328  20121231    8927779.35   \n",
       "1462075  000001    保利发展控股集团股份有限公司  20200121  20191231  303983999.02   \n",
       "1462242  000001    上海钢联电子商务股份有限公司  20200121  20191231   193610279.2   \n",
       "\n",
       "             amount stk_mkv_ratio stk_float_ratio          type  \n",
       "1461735       874.0           0.0             0.0  IN_PORTFOLIO  \n",
       "1461930   NotChange          4.31       NotChange  IN_PORTFOLIO  \n",
       "1462074    363657.0          0.15            0.21  IN_PORTFOLIO  \n",
       "1462075  18787639.0          8.74            0.16  IN_PORTFOLIO  \n",
       "1462242   2488564.0          5.57            1.63  IN_PORTFOLIO  "
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "te.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
